Instal relavent libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.92 loaded
library(ggcorrplot)
## Loading required package: ggplot2
library(caret)
## Loading required package: lattice
library(tidyr) 
library(ggplot2)
library(stringr)
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.3.3

Load 2014 CSV file for Analysis

data_2014 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2014.csv")
str(data_2014)
## 'data.frame':    42727 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  6830 128 8 168 85 352 100 59 23 5 ...
##  $ Tot_Suplrs              : int  8928 222 8 246 107 447 145 91 33 7 ...
##  $ Tot_Suplr_Benes         : int  12351 305 NA 293 177 504 163 97 43 NA ...
##  $ Tot_Suplr_Clms          : int  54724 1332 27 1151 843 2246 749 355 156 19 ...
##  $ Tot_Suplr_Srvcs         : int  5258684 138847 2160 106080 77090 184666 61110 45340 15360 1540 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  10.8 10.3 10.7 10.9 10.5 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  2.78 2.8 2.8 2.8 2.79 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  2.13 2.07 2.16 2.17 2.16 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  2.13 2.07 2.16 2.17 2.16 ...
# Convert empty strings or whitespace to NA
data_2014 <- data_2014 %>%
  mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))

# Check for missing values in each column
colSums(is.na(data_2014))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                     1841                       10 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                     4810 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2014$Tot_Suplr_Benes)) / nrow(data_2014) * 100
missing_percentage
## [1] 11.25752
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2014$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2014) * 100
missing_percentage
## [1] 4.308751
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2014$Rfrg_Prvdr_Geo_Desc  )) / nrow(data_2014) * 100
missing_percentage
## [1] 0.0234044
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2014 <- data_2014[!is.na(data_2014$Rfrg_Prvdr_Geo_Desc), ]

# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2014 <- data_2014 %>%
  mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))

#convert the column back to integer after the imputation
data_2014$Tot_Suplr_Benes <- as.integer(data_2014$Tot_Suplr_Benes)

# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2014$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2014$Rfrg_Prvdr_Geo_Cd), "National", data_2014$Rfrg_Prvdr_Geo_Cd)

# Check for missing values in each updated column
colSums(is.na(data_2014))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                        0                        0 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                        0 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Check the updated column
str(data_2014)
## 'data.frame':    42717 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "National" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  6830 128 8 168 85 352 100 59 23 5 ...
##  $ Tot_Suplrs              : int  8928 222 8 246 107 447 145 91 33 7 ...
##  $ Tot_Suplr_Benes         : int  12351 305 86 293 177 504 163 97 43 86 ...
##  $ Tot_Suplr_Clms          : int  54724 1332 27 1151 843 2246 749 355 156 19 ...
##  $ Tot_Suplr_Srvcs         : int  5258684 138847 2160 106080 77090 184666 61110 45340 15360 1540 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  10.8 10.3 10.7 10.9 10.5 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  2.78 2.8 2.8 2.8 2.79 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  2.13 2.07 2.16 2.17 2.16 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  2.13 2.07 2.16 2.17 2.16 ...

Load 2015 CSV file for Analysis

data_2015 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2015.csv")
str(data_2015)
## 'data.frame':    42372 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  7725 145 15 206 101 417 121 77 27 6 ...
##  $ Tot_Suplrs              : int  10048 258 17 301 129 519 165 103 40 10 ...
##  $ Tot_Suplr_Benes         : int  14759 373 17 377 209 620 194 108 59 NA ...
##  $ Tot_Suplr_Clms          : int  62771 1557 67 1433 960 2463 797 431 205 23 ...
##  $ Tot_Suplr_Srvcs         : int  6275925 172090 4880 140880 94520 213915 69950 53685 21130 1450 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  12.3 11.7 13.5 12.7 11.7 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  2.79 2.8 2.8 2.8 2.8 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  2.14 2.11 2.14 2.17 2.15 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  2.14 2.11 2.14 2.17 2.15 ...
# Convert empty strings or whitespace to NA
data_2015 <- data_2015 %>%
  mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))

# Check for missing values in each column
colSums(is.na(data_2015))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                     1808                        8 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                     4920 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2015$Tot_Suplr_Benes)) / nrow(data_2015) * 100
missing_percentage
## [1] 11.61144
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2015$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2015) * 100
missing_percentage
## [1] 4.266969
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2015$Rfrg_Prvdr_Geo_Desc  )) / nrow(data_2015) * 100
missing_percentage
## [1] 0.01888039
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2015 <- data_2015[!is.na(data_2015$Rfrg_Prvdr_Geo_Desc), ]

# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2015 <- data_2015 %>%
  mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))

#convert the column back to integer after the imputation
data_2015$Tot_Suplr_Benes <- as.integer(data_2015$Tot_Suplr_Benes)

# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2015$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2015$Rfrg_Prvdr_Geo_Cd), "National", data_2015$Rfrg_Prvdr_Geo_Cd)

# Check for missing values in each updated column
colSums(is.na(data_2015))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                        0                        0 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                        0 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Check the updated column
str(data_2015)
## 'data.frame':    42364 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "National" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  7725 145 15 206 101 417 121 77 27 6 ...
##  $ Tot_Suplrs              : int  10048 258 17 301 129 519 165 103 40 10 ...
##  $ Tot_Suplr_Benes         : int  14759 373 17 377 209 620 194 108 59 89 ...
##  $ Tot_Suplr_Clms          : int  62771 1557 67 1433 960 2463 797 431 205 23 ...
##  $ Tot_Suplr_Srvcs         : int  6275925 172090 4880 140880 94520 213915 69950 53685 21130 1450 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  12.3 11.7 13.5 12.7 11.7 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  2.79 2.8 2.8 2.8 2.8 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  2.14 2.11 2.14 2.17 2.15 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  2.14 2.11 2.14 2.17 2.15 ...

Load 2016 CSV file for Analysis

data_2016 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2016.csv")
str(data_2016)
## 'data.frame':    42259 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  8346 145 13 215 102 451 146 85 29 11 ...
##  $ Tot_Suplrs              : int  11338 280 14 317 149 591 221 125 51 17 ...
##  $ Tot_Suplr_Benes         : int  18408 450 17 448 261 739 263 133 78 15 ...
##  $ Tot_Suplr_Clms          : int  82003 2053 66 1784 1266 3299 1078 477 297 41 ...
##  $ Tot_Suplr_Srvcs         : int  8247311 217440 6130 176770 115590 284860 100410 55750 32130 5040 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  14.2 13.8 15.3 14.7 14 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  2.8 2.8 2.8 2.84 2.8 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  2.16 2.13 2.13 2.2 2.15 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  2.16 2.13 2.13 2.2 2.15 ...
# Convert empty strings or whitespace to NA
data_2016 <- data_2016 %>%
  mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))

# Check for missing values in each column
colSums(is.na(data_2016))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                     1791                        6 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                     4840 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2016$Tot_Suplr_Benes)) / nrow(data_2016) * 100
missing_percentage
## [1] 11.45318
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2016$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2016) * 100
missing_percentage
## [1] 4.23815
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2016$Rfrg_Prvdr_Geo_Desc  )) / nrow(data_2016) * 100
missing_percentage
## [1] 0.01419816
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2016 <- data_2016[!is.na(data_2016$Rfrg_Prvdr_Geo_Desc), ]

# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2016 <- data_2016 %>%
  mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))

#convert the column back to integer after the imputation
data_2016$Tot_Suplr_Benes <- as.integer(data_2016$Tot_Suplr_Benes)

# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2016$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2016$Rfrg_Prvdr_Geo_Cd), "National", data_2016$Rfrg_Prvdr_Geo_Cd)

# Check for missing values in each updated column
colSums(is.na(data_2016))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                        0                        0 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                        0 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Check the updated column
str(data_2016)
## 'data.frame':    42253 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "National" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  8346 145 13 215 102 451 146 85 29 11 ...
##  $ Tot_Suplrs              : int  11338 280 14 317 149 591 221 125 51 17 ...
##  $ Tot_Suplr_Benes         : int  18408 450 17 448 261 739 263 133 78 15 ...
##  $ Tot_Suplr_Clms          : int  82003 2053 66 1784 1266 3299 1078 477 297 41 ...
##  $ Tot_Suplr_Srvcs         : int  8247311 217440 6130 176770 115590 284860 100410 55750 32130 5040 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  14.2 13.8 15.3 14.7 14 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  2.8 2.8 2.8 2.84 2.8 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  2.16 2.13 2.13 2.2 2.15 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  2.16 2.13 2.13 2.2 2.15 ...

Load 2017 CSV file for Analysis

data_2017 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2017.csv")
str(data_2017)
## 'data.frame':    42127 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  9621 164 22 251 118 540 145 102 33 14 ...
##  $ Tot_Suplrs              : int  12571 307 23 379 171 725 234 138 62 21 ...
##  $ Tot_Suplr_Benes         : int  21648 495 27 545 290 938 294 170 103 21 ...
##  $ Tot_Suplr_Clms          : int  97398 2211 87 2117 1573 4158 1284 626 335 69 ...
##  $ Tot_Suplr_Srvcs         : int  9867276 233770 8240 205464 140650 358590 113030 89822 44740 7680 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  15.3 14.8 16.4 16 14.8 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  9.61 9.47 9.75 9.73 9.55 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  7.4 7.14 7.6 7.54 7.21 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  7.4 7.14 7.6 7.54 7.21 ...
# Convert empty strings or whitespace to NA
data_2017 <- data_2017 %>%
  mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))

# Check for missing values in each column
colSums(is.na(data_2017))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                     1807                       38 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                     4842 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2017$Tot_Suplr_Benes)) / nrow(data_2017) * 100
missing_percentage
## [1] 11.49382
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2017$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2017) * 100
missing_percentage
## [1] 4.289411
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2017$Rfrg_Prvdr_Geo_Desc  )) / nrow(data_2017) * 100
missing_percentage
## [1] 0.09020343
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2017 <- data_2017[!is.na(data_2017$Rfrg_Prvdr_Geo_Desc), ]

# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2017 <- data_2017 %>%
  mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))

#convert the column back to integer after the imputation
data_2017$Tot_Suplr_Benes <- as.integer(data_2017$Tot_Suplr_Benes)

# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2017$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2017$Rfrg_Prvdr_Geo_Cd), "National", data_2017$Rfrg_Prvdr_Geo_Cd)

# Check for missing values in each updated column
colSums(is.na(data_2017))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                        0                        0 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                        0 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Check the updated column
str(data_2017)
## 'data.frame':    42089 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "National" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  9621 164 22 251 118 540 145 102 33 14 ...
##  $ Tot_Suplrs              : int  12571 307 23 379 171 725 234 138 62 21 ...
##  $ Tot_Suplr_Benes         : int  21648 495 27 545 290 938 294 170 103 21 ...
##  $ Tot_Suplr_Clms          : int  97398 2211 87 2117 1573 4158 1284 626 335 69 ...
##  $ Tot_Suplr_Srvcs         : int  9867276 233770 8240 205464 140650 358590 113030 89822 44740 7680 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  15.3 14.8 16.4 16 14.8 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  9.61 9.47 9.75 9.73 9.55 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  7.4 7.14 7.6 7.54 7.21 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  7.4 7.14 7.6 7.54 7.21 ...

Load 2018 CSV file for Analysis

data_2018 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2018.csv")
str(data_2018)
## 'data.frame':    42397 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  10626 173 23 269 111 612 190 108 31 16 ...
##  $ Tot_Suplrs              : int  14509 346 24 409 194 879 292 155 65 30 ...
##  $ Tot_Suplr_Benes         : int  26102 591 30 630 353 1202 414 197 113 30 ...
##  $ Tot_Suplr_Clms          : int  113148 2624 96 2425 1809 4985 1617 711 357 95 ...
##  $ Tot_Suplr_Srvcs         : int  11902027 290663 9620 249205 167920 461317 146890 90050 57000 13000 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  15.6 14.6 17.5 16.3 15.3 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  10.5 10 10.6 10.7 10.4 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  8.12 7.65 8.31 8.3 7.99 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  8.12 7.65 8.31 8.3 7.99 ...
# Convert empty strings or whitespace to NA
data_2018 <- data_2018 %>%
  mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))

# Check for missing values in each column
colSums(is.na(data_2018))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                     1793                       29 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                     4667 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2018$Tot_Suplr_Benes)) / nrow(data_2018) * 100
missing_percentage
## [1] 11.00785
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2018$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2018) * 100
missing_percentage
## [1] 4.229073
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2018$Rfrg_Prvdr_Geo_Desc  )) / nrow(data_2018) * 100
missing_percentage
## [1] 0.06840107
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2018 <- data_2018[!is.na(data_2018$Rfrg_Prvdr_Geo_Desc), ]

# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2018 <- data_2018 %>%
  mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))

#convert the column back to integer after the imputation
data_2018$Tot_Suplr_Benes <- as.integer(data_2018$Tot_Suplr_Benes)

# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2018$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2018$Rfrg_Prvdr_Geo_Cd), "National", data_2018$Rfrg_Prvdr_Geo_Cd)

# Check for missing values in each updated column
colSums(is.na(data_2018))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                        0                        0 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                        0 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Check the updated column
str(data_2018)
## 'data.frame':    42368 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "National" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  10626 173 23 269 111 612 190 108 31 16 ...
##  $ Tot_Suplrs              : int  14509 346 24 409 194 879 292 155 65 30 ...
##  $ Tot_Suplr_Benes         : int  26102 591 30 630 353 1202 414 197 113 30 ...
##  $ Tot_Suplr_Clms          : int  113148 2624 96 2425 1809 4985 1617 711 357 95 ...
##  $ Tot_Suplr_Srvcs         : int  11902027 290663 9620 249205 167920 461317 146890 90050 57000 13000 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  15.6 14.6 17.5 16.3 15.3 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  10.5 10 10.6 10.7 10.4 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  8.12 7.65 8.31 8.3 7.99 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  8.12 7.65 8.31 8.3 7.99 ...

Load 2019 CSV file for Analysis

data_2019 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2019.csv")
str(data_2019)
## 'data.frame':    42198 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  11583 181 29 286 123 653 214 113 33 16 ...
##  $ Tot_Suplrs              : int  15921 354 28 437 207 986 334 186 77 30 ...
##  $ Tot_Suplr_Benes         : int  31138 695 35 725 433 1438 500 226 138 27 ...
##  $ Tot_Suplr_Clms          : int  134823 3109 112 2780 2263 5952 2052 812 421 101 ...
##  $ Tot_Suplr_Srvcs         : int  14425456 366840 11500 288320 200590 565320 187550 99900 64860 10680 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  16.1 14.7 17.7 16.6 15.8 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  10.7 10 10.9 10.9 10.7 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  8.31 7.7 8.55 8.47 8.21 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  8.31 7.7 8.55 8.47 8.21 ...
# Convert empty strings or whitespace to NA
data_2019 <- data_2019 %>%
  mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))

# Check for missing values in each column
colSums(is.na(data_2019))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                     1787                       33 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                     4662 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2019$Tot_Suplr_Benes)) / nrow(data_2019) * 100
missing_percentage
## [1] 11.04792
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2019$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2019) * 100
missing_percentage
## [1] 4.234798
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2019$Rfrg_Prvdr_Geo_Desc  )) / nrow(data_2019) * 100
missing_percentage
## [1] 0.07820276
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2019 <- data_2019[!is.na(data_2019$Rfrg_Prvdr_Geo_Desc), ]

# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2019 <- data_2019 %>%
  mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))

#convert the column back to integer after the imputation
data_2019$Tot_Suplr_Benes <- as.integer(data_2019$Tot_Suplr_Benes)

# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2019$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2019$Rfrg_Prvdr_Geo_Cd), "National", data_2019$Rfrg_Prvdr_Geo_Cd)

# Check for missing values in each updated column
colSums(is.na(data_2019))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                        0                        0 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                        0 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Check the updated column
str(data_2019)
## 'data.frame':    42165 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "National" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  11583 181 29 286 123 653 214 113 33 16 ...
##  $ Tot_Suplrs              : int  15921 354 28 437 207 986 334 186 77 30 ...
##  $ Tot_Suplr_Benes         : int  31138 695 35 725 433 1438 500 226 138 27 ...
##  $ Tot_Suplr_Clms          : int  134823 3109 112 2780 2263 5952 2052 812 421 101 ...
##  $ Tot_Suplr_Srvcs         : int  14425456 366840 11500 288320 200590 565320 187550 99900 64860 10680 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  16.1 14.7 17.7 16.6 15.8 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  10.7 10 10.9 10.9 10.7 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  8.31 7.7 8.55 8.47 8.21 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  8.31 7.7 8.55 8.47 8.21 ...

Load 2020 CSV file for Analysis

data_2020 <- read.csv("Medicare_DME_Devices_Supplies_by_Geography_and_Service_2020.csv")
str(data_2020)
## 'data.frame':    41098 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  11718 194 28 276 120 648 223 125 34 16 ...
##  $ Tot_Suplrs              : int  16373 365 26 445 224 1035 345 216 72 31 ...
##  $ Tot_Suplr_Benes         : int  33547 728 36 804 477 1584 540 257 128 32 ...
##  $ Tot_Suplr_Clms          : int  146868 3289 141 2944 2629 6515 2236 909 435 106 ...
##  $ Tot_Suplr_Srvcs         : int  16113154 458510 13450 317610 235900 646460 209820 110510 59550 9780 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  15.5 12.5 14.2 15.8 14.8 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  10.13 8.41 10.01 10.27 10.02 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  7.97 6.54 7.94 8.1 7.83 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  7.86 6.46 7.82 7.99 7.72 ...
# Convert empty strings or whitespace to NA
data_2020 <- data_2020 %>%
  mutate(across(everything(), ~ ifelse(. == "" | str_trim(.) == "", NA, .)))

# Check for missing values in each column
colSums(is.na(data_2020))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                     1723                        2 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                     4782 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Calculate the percentage of missing values in the 'Tot_Suplr_Benes' column
missing_percentage <- sum(is.na(data_2020$Tot_Suplr_Benes)) / nrow(data_2020) * 100
missing_percentage
## [1] 11.6356
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Cd' column
missing_percentage <- sum(is.na(data_2020$Rfrg_Prvdr_Geo_Cd )) / nrow(data_2020) * 100
missing_percentage
## [1] 4.192418
# Calculate the percentage of missing values in the 'Rfrg_Prvdr_Geo_Desc ' column
missing_percentage <- sum(is.na(data_2020$Rfrg_Prvdr_Geo_Desc  )) / nrow(data_2020) * 100
missing_percentage
## [1] 0.004866417
# Remove rows with NA in the Rfrg_Prvdr_Geo_Desc column
data_2020 <- data_2020[!is.na(data_2020$Rfrg_Prvdr_Geo_Desc), ]

# Replaces missing values in the 'Tot_Suplr_Benes' column with the median of that column.
data_2020 <- data_2020 %>%
  mutate(Tot_Suplr_Benes = ifelse(is.na(Tot_Suplr_Benes), median(Tot_Suplr_Benes, na.rm = TRUE), Tot_Suplr_Benes))

#convert the column back to integer after the imputation
data_2020$Tot_Suplr_Benes <- as.integer(data_2020$Tot_Suplr_Benes)

# Replace NA values in 'Rfrg_Prvdr_Geo_Cd' with "National"
data_2020$Rfrg_Prvdr_Geo_Cd <- ifelse(is.na(data_2020$Rfrg_Prvdr_Geo_Cd), "National", data_2020$Rfrg_Prvdr_Geo_Cd)

# Check for missing values in each updated column
colSums(is.na(data_2020))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                        0                        0 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                        0 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0
# Check the updated column
str(data_2020)
## 'data.frame':    41096 obs. of  18 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "National" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  11718 194 28 276 120 648 223 125 34 16 ...
##  $ Tot_Suplrs              : int  16373 365 26 445 224 1035 345 216 72 31 ...
##  $ Tot_Suplr_Benes         : int  33547 728 36 804 477 1584 540 257 128 32 ...
##  $ Tot_Suplr_Clms          : int  146868 3289 141 2944 2629 6515 2236 909 435 106 ...
##  $ Tot_Suplr_Srvcs         : int  16113154 458510 13450 317610 235900 646460 209820 110510 59550 9780 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  15.5 12.5 14.2 15.8 14.8 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  10.13 8.41 10.01 10.27 10.02 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  7.97 6.54 7.94 8.1 7.83 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  7.86 6.46 7.82 7.99 7.72 ...
# Add the Year column to each dataset
data_2014$Year <- as.integer(2014)
data_2015$Year <- as.integer(2015)
data_2016$Year <- as.integer(2016)
data_2017$Year <- as.integer(2017)
data_2018$Year <- as.integer(2018)
data_2019$Year <- as.integer(2019)
data_2020$Year <- as.integer(2020)

Combine all the Data

# Combine all data frames into one
combined_data <- rbind(data_2014, data_2015, data_2016, data_2017, data_2018, data_2019, data_2020)

# View the combined data
head(combined_data)
##   Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 1           National          National            National
## 2              State                01             Alabama
## 3              State                02              Alaska
## 4              State                04             Arizona
## 5              State                05            Arkansas
## 6              State                06          California
##                         RBCS_Lvl RBCS_Id                          RBCS_Desc
## 1 Drugs Administered Through DME  DG000N DME-Drugs Administered Through DME
## 2 Drugs Administered Through DME  DG000N DME-Drugs Administered Through DME
## 3 Drugs Administered Through DME  DG000N DME-Drugs Administered Through DME
## 4 Drugs Administered Through DME  DG000N DME-Drugs Administered Through DME
## 5 Drugs Administered Through DME  DG000N DME-Drugs Administered Through DME
## 6 Drugs Administered Through DME  DG000N DME-Drugs Administered Through DME
##   HCPCS_Cd
## 1    J1817
## 2    J1817
## 3    J1817
## 4    J1817
## 5    J1817
## 6    J1817
##                                                                 HCPCS_Desc
## 1 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 2 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 3 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 4 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 5 Insulin for administration through dme (i.e., insulin pump) per 50 units
## 6 Insulin for administration through dme (i.e., insulin pump) per 50 units
##   Suplr_Rentl_Ind Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes Tot_Suplr_Clms
## 1               N            6830       8928           12351          54724
## 2               N             128        222             305           1332
## 3               N               8          8              86             27
## 4               N             168        246             293           1151
## 5               N              85        107             177            843
## 6               N             352        447             504           2246
##   Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt
## 1         5258684             10.76232                 2.783783
## 2          138847             10.25206                 2.796111
## 3            2160             10.69145                 2.800000
## 4          106080             10.87012                 2.800000
## 5           77090             10.45740                 2.791503
## 6          184666             10.79762                 2.799330
##   Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt Year
## 1                2.131986                 2.131986 2014
## 2                2.067193                 2.067193 2014
## 3                2.164935                 2.164935 2014
## 4                2.165747                 2.165747 2014
## 5                2.160118                 2.160118 2014
## 6                2.152167                 2.152167 2014
#  Write the combined data to a new CSV file
#write.csv(combined_data, "combined_data.csv", row.names = FALSE)
# View the structure of the data
str(combined_data)
## 'data.frame':    295052 obs. of  19 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "National" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "National" "01" "02" "04" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "National" "Alabama" "Alaska" "Arizona" ...
##  $ RBCS_Lvl                : chr  "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" "Drugs Administered Through DME" ...
##  $ RBCS_Id                 : chr  "DG000N" "DG000N" "DG000N" "DG000N" ...
##  $ RBCS_Desc               : chr  "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" "DME-Drugs Administered Through DME" ...
##  $ HCPCS_Cd                : chr  "J1817" "J1817" "J1817" "J1817" ...
##  $ HCPCS_Desc              : chr  "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" "Insulin for administration through dme (i.e., insulin pump) per 50 units" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  6830 128 8 168 85 352 100 59 23 5 ...
##  $ Tot_Suplrs              : int  8928 222 8 246 107 447 145 91 33 7 ...
##  $ Tot_Suplr_Benes         : int  12351 305 86 293 177 504 163 97 43 86 ...
##  $ Tot_Suplr_Clms          : int  54724 1332 27 1151 843 2246 749 355 156 19 ...
##  $ Tot_Suplr_Srvcs         : int  5258684 138847 2160 106080 77090 184666 61110 45340 15360 1540 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  10.8 10.3 10.7 10.9 10.5 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  2.78 2.8 2.8 2.8 2.79 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  2.13 2.07 2.16 2.17 2.16 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  2.13 2.07 2.16 2.17 2.16 ...
##  $ Year                    : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# Check for missing values in combined_data column
colSums(is.na(combined_data))
##       Rfrg_Prvdr_Geo_Lvl        Rfrg_Prvdr_Geo_Cd      Rfrg_Prvdr_Geo_Desc 
##                        0                        0                        0 
##                 RBCS_Lvl                  RBCS_Id                RBCS_Desc 
##                        0                        0                        0 
##                 HCPCS_Cd               HCPCS_Desc          Suplr_Rentl_Ind 
##                        0                        0                        0 
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##                        0                        0                        0 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##                        0                        0                        0 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##                        0                        0                        0 
##                     Year 
##                        0
# Find integer columns ( discrete)
int_cols <- sapply(combined_data, is.integer)
discrete_vars <- names(combined_data)[int_cols]
print("Discrete Variables (Integers):")
## [1] "Discrete Variables (Integers):"
print(discrete_vars)
## [1] "Tot_Rfrg_Prvdrs" "Tot_Suplrs"      "Tot_Suplr_Benes" "Tot_Suplr_Clms" 
## [5] "Tot_Suplr_Srvcs" "Year"
# Identify continuous variables (numeric but not integers)
num_cols <- sapply(combined_data, is.numeric)
continuous_vars <- names(combined_data)[num_cols & !int_cols]
print("Continuous Variables (Numeric):")
## [1] "Continuous Variables (Numeric):"
print(continuous_vars)
## [1] "Avg_Suplr_Sbmtd_Chrg"     "Avg_Suplr_Mdcr_Alowd_Amt"
## [3] "Avg_Suplr_Mdcr_Pymt_Amt"  "Avg_Suplr_Mdcr_Stdzd_Amt"
# Identify boolean columns (binary values)
boolean_columns <- sapply(combined_data, function(x) is.character(x) && length(unique(x)) == 2)
print(names(combined_data)[boolean_columns])
## [1] "Rfrg_Prvdr_Geo_Lvl" "Suplr_Rentl_Ind"
# Identify nominal columns (categorical & more than 2 unique value)
nominal_columns <- sapply(combined_data, function(x) is.character(x) && length(unique(x)) > 2)

print(names(combined_data)[nominal_columns])
## [1] "Rfrg_Prvdr_Geo_Cd"   "Rfrg_Prvdr_Geo_Desc" "RBCS_Lvl"           
## [4] "RBCS_Id"             "RBCS_Desc"           "HCPCS_Cd"           
## [7] "HCPCS_Desc"

Duplicate Analysis

# Check for duplicate rows in the combined_data
duplicate_rows <- sum(duplicated(combined_data))

# Print the number of duplicate rows
print(paste("Number of duplicate rows: ", duplicate_rows))
## [1] "Number of duplicate rows:  0"

Normalization

# Loop through each continuous variable and plot histograms
for (col_name in continuous_vars) {
  
  # Extract the data for the current column
  column_data <- combined_data[[col_name]]
  
# Plot histogram with a normal curve using plotNormalHistogram()
  plotNormalHistogram(column_data, 
                      main = paste("Histogram of", col_name), 
                      xlab = col_name)
}

# Subsample 5000 rows from each continuous column 
subsample_Avg_Suplr_Sbmtd_Chrg <- sample(combined_data[["Avg_Suplr_Sbmtd_Chrg"]], 5000)

# Apply the Shapiro-Wilk test
print(shapiro.test(subsample_Avg_Suplr_Sbmtd_Chrg))
## 
##  Shapiro-Wilk normality test
## 
## data:  subsample_Avg_Suplr_Sbmtd_Chrg
## W = 0.30569, p-value < 2.2e-16
subsample_Avg_Suplr_Mdcr_Alowd_Amt <- sample(combined_data[["Avg_Suplr_Mdcr_Alowd_Amt"]], 5000)
print(shapiro.test(subsample_Avg_Suplr_Mdcr_Alowd_Amt))
## 
##  Shapiro-Wilk normality test
## 
## data:  subsample_Avg_Suplr_Mdcr_Alowd_Amt
## W = 0.24844, p-value < 2.2e-16
subsample_Avg_Suplr_Mdcr_Pymt_Amt <- sample(combined_data[["Avg_Suplr_Mdcr_Pymt_Amt"]], 5000)
print( shapiro.test(subsample_Avg_Suplr_Mdcr_Pymt_Amt))
## 
##  Shapiro-Wilk normality test
## 
## data:  subsample_Avg_Suplr_Mdcr_Pymt_Amt
## W = 0.24635, p-value < 2.2e-16
subsample_Avg_Suplr_Mdcr_Stdzd_Amt <- sample(combined_data[["Avg_Suplr_Mdcr_Stdzd_Amt"]], 5000)
print( shapiro.test(subsample_Avg_Suplr_Mdcr_Stdzd_Amt))
## 
##  Shapiro-Wilk normality test
## 
## data:  subsample_Avg_Suplr_Mdcr_Stdzd_Amt
## W = 0.2375, p-value < 2.2e-16
# Scale the continuous variables without modifying the original dataset
scaled_data <- scale(combined_data[continuous_vars])
# View the scaled dataset (just the scaled variables)
head(scaled_data)
##   Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt
## 1           -0.2985382               -0.2846836              -0.2841732
## 2           -0.2988250               -0.2846732              -0.2842437
## 3           -0.2985781               -0.2846699              -0.2841374
## 4           -0.2984776               -0.2846699              -0.2841365
## 5           -0.2987096               -0.2846771              -0.2841426
## 6           -0.2985184               -0.2846705              -0.2841513
##   Avg_Suplr_Mdcr_Stdzd_Amt
## 1               -0.2853697
## 2               -0.2854403
## 3               -0.2853339
## 4               -0.2853330
## 5               -0.2853391
## 6               -0.2853478
# Loop through each numeric column in the scaled data and plot a histogram
for (i in 1:ncol(scaled_data)) {
  # Plot histogram with a normal curve
    plotNormalHistogram(scaled_data, 
                        main = paste("Histogram of scaled_data", col_name),
                        xlab = col_name)
}

# Loop through each continuous variable and apply square root transformation
for (col_name in continuous_vars) {
  
  # Apply square root transformation
  transformed_data <- sqrt(combined_data[[col_name]])
  
    # Plot histogram with a normal curve
    plotNormalHistogram(transformed_data, 
                        main = paste("Histogram of T_Square", col_name),
                        xlab = col_name)
  }

# Apply cube root transformation and plot histograms
for (col_name in continuous_vars) {
  
  # Apply cube root transformation
  T_cub <- sign(combined_data[[col_name]]) * abs(combined_data[[col_name]])^(1/3)
  
  # plot histograms with normal distribution overlay
  plotNormalHistogram(T_cub, 
                      main = paste("Histogram of T_Cube", col_name), 
                      xlab = col_name)
}

# Loop through each continuous variable, apply log transformation, and plot histograms
for (col_name in continuous_vars) {
  
 # Apply log transformation with a small constant added
T_log <- log(combined_data[[col_name]] + 1)  # Add 1 to avoid log(0) errors
  # plot histograms with normal distribution overlay
  plotNormalHistogram(T_log, 
                      main = paste("Histogram of T_log", col_name), 
                      xlab = col_name) 

   # QQ Plot for log-transformed variable
  qqnorm(T_log, 
         ylab = paste("Sample Quantiles for", col_name), 
         main = paste("QQ Plot ", col_name))
  
  # Add QQ line
  qqline(T_log, 
         col = "red")
}

# Randomly sample 5000 rows from the dataset
subsample <- sample(T_log, 5000)

# Apply the Shapiro-Wilk test to the subsample
shapiro.test(subsample)
## 
##  Shapiro-Wilk normality test
## 
## data:  subsample
## W = 0.98642, p-value < 2.2e-16
# Calculate Spearman's Rank Correlation for continuous variables
spearman_correlation_matrix <- cor(combined_data[continuous_vars], method = "spearman")

# View the Spearman correlation matrix
print(spearman_correlation_matrix)
##                          Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt
## Avg_Suplr_Sbmtd_Chrg                1.0000000                0.9648672
## Avg_Suplr_Mdcr_Alowd_Amt            0.9648672                1.0000000
## Avg_Suplr_Mdcr_Pymt_Amt             0.9643239                0.9997474
## Avg_Suplr_Mdcr_Stdzd_Amt            0.9597192                0.9974304
##                          Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
## Avg_Suplr_Sbmtd_Chrg                   0.9643239                0.9597192
## Avg_Suplr_Mdcr_Alowd_Amt               0.9997474                0.9974304
## Avg_Suplr_Mdcr_Pymt_Amt                1.0000000                0.9976300
## Avg_Suplr_Mdcr_Stdzd_Amt               0.9976300                1.0000000
# Loop through each discrete variable and plot histograms
for (col_name in discrete_vars) {
  
  # Extract the data for the current column
  column_data <- combined_data[[col_name]]
  
  # Plot histogram for the discrete variable with normal distribution overlay
   plotNormalHistogram(column_data, 
                      main = paste("Histogram of Discrete", col_name), 
                      xlab = col_name)
}

# Loop through each discrete variable and create a bar plot
for (col_name in discrete_vars) {
  # Create the bar plot
  barplot(table(combined_data[[col_name]]), 
          main = paste("Distribution of", col_name), 
          xlab = col_name, 
          ylab = "Frequency",
          col = "red")
          
}

# Calculate Spearman's Rank Correlation for discrete variables
spearman_correlation_matrix <- cor(combined_data[discrete_vars], method = "spearman")

# View the Spearman correlation matrix
print(spearman_correlation_matrix)
##                 Tot_Rfrg_Prvdrs  Tot_Suplrs Tot_Suplr_Benes Tot_Suplr_Clms
## Tot_Rfrg_Prvdrs     1.000000000  0.89837491      0.80542044     0.89775162
## Tot_Suplrs          0.898374912  1.00000000      0.73070515     0.81571093
## Tot_Suplr_Benes     0.805420442  0.73070515      1.00000000     0.81571183
## Tot_Suplr_Clms      0.897751625  0.81571093      0.81571183     1.00000000
## Tot_Suplr_Srvcs     0.665880706  0.61512445      0.65662987     0.83173216
## Year               -0.004721095 -0.01391891      0.01057104     0.01227504
##                 Tot_Suplr_Srvcs         Year
## Tot_Rfrg_Prvdrs      0.66588071 -0.004721095
## Tot_Suplrs           0.61512445 -0.013918909
## Tot_Suplr_Benes      0.65662987  0.010571036
## Tot_Suplr_Clms       0.83173216  0.012275038
## Tot_Suplr_Srvcs      1.00000000  0.017659445
## Year                 0.01765944  1.000000000
# Filter out only numeric columns from combined_data
numeric_data <- combined_data[, sapply(combined_data, is.numeric)]

# Calculate the Spearman correlation matrix for numeric columns
spearman_cor <- cor(numeric_data, method = "spearman", use = "complete.obs")

# Generate a heat map for the Spearman correlation matrix
corrplot(spearman_cor, method = "color", type = "ful",tl.cex = 0.5, addCoef.col = "black")

#  Barplot for binary categorical variable 
# Suplr_Rentl_Ind
barplot(table(combined_data$Suplr_Rentl_Ind), main = "Barplot of Suplr_Rentl_Ind", 
        xlab = "Supplier Rental Indicator", col = "orange")

#Rfrg_Prvdr_Geo_Lvl 
barplot(table(combined_data$Rfrg_Prvdr_Geo_Lvl), main = "Barplot of Rfrg_Prvdr_Geo_Lvl", 
        xlab = "Referring Provider Geography Level ", col = "blue")

EDA analysis

# Install package
#install.packages("dataMaid")
# Import library
#library(dataMaid)
# Create report
#makeDataReport(combined_data, output = "html", replace = TRUE)

Defining underserved regions

#  Calculating percentiles for high charges and low service volumes
charge_threshold_high <- quantile(combined_data$Avg_Suplr_Sbmtd_Chrg, 0.95, na.rm = TRUE)  # 90th percentile for charges
service_threshold_low <- quantile(combined_data$Tot_Suplr_Srvcs, 0.05, na.rm = TRUE)  # 10th percentile for services

# Flagging unusually high charges (above 95th percentile)
combined_data <- combined_data %>%
  mutate(high_charges = ifelse(Avg_Suplr_Sbmtd_Chrg >= charge_threshold_high, TRUE, FALSE))

# Flagging low service volumes (below 5th percentile)
combined_data <- combined_data %>%
  mutate(low_service_volumes = ifelse(Tot_Suplr_Srvcs <= service_threshold_low, TRUE, FALSE))

# Defining underserved regions where both conditions are met
combined_data <- combined_data %>%
  mutate(is_underserved = ifelse(high_charges == TRUE & low_service_volumes == TRUE, TRUE, FALSE))

# Summary statistics for underserved regions
underserved_regions <- combined_data %>% filter(is_underserved == TRUE)

summary(underserved_regions)
##  Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd  Rfrg_Prvdr_Geo_Desc   RBCS_Lvl        
##  Length:2068        Length:2068        Length:2068         Length:2068       
##  Class :character   Class :character   Class :character    Class :character  
##  Mode  :character   Mode  :character   Mode  :character    Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##    RBCS_Id           RBCS_Desc           HCPCS_Cd          HCPCS_Desc       
##  Length:2068        Length:2068        Length:2068        Length:2068       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Suplr_Rentl_Ind    Tot_Rfrg_Prvdrs   Tot_Suplrs     Tot_Suplr_Benes
##  Length:2068        Min.   : 1.00   Min.   : 1.000   Min.   :11.00  
##  Class :character   1st Qu.:10.00   1st Qu.: 4.000   1st Qu.:12.00  
##  Mode  :character   Median :12.00   Median : 7.000   Median :13.00  
##                     Mean   :11.35   Mean   : 6.676   Mean   :18.85  
##                     3rd Qu.:13.00   3rd Qu.: 9.000   3rd Qu.:15.00  
##                     Max.   :16.00   Max.   :16.000   Max.   :92.00  
##  Tot_Suplr_Clms  Tot_Suplr_Srvcs Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt
##  Min.   :11.00   Min.   :11.00   Min.   : 2425        Min.   :   82.3         
##  1st Qu.:12.00   1st Qu.:12.00   1st Qu.: 3282        1st Qu.: 2237.6         
##  Median :13.00   Median :13.00   Median : 4835        Median : 3285.3         
##  Mean   :13.11   Mean   :13.35   Mean   : 6989        Mean   : 4433.8         
##  3rd Qu.:14.00   3rd Qu.:15.00   3rd Qu.: 7887        3rd Qu.: 4946.6         
##  Max.   :16.00   Max.   :16.00   Max.   :70913        Max.   :39266.0         
##  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt      Year      high_charges  
##  Min.   :   60.18        Min.   :   58.22         Min.   :2014   Mode:logical  
##  1st Qu.: 1730.06        1st Qu.: 1714.62         1st Qu.:2015   TRUE:2068     
##  Median : 2540.24        Median : 2526.78         Median :2017                 
##  Mean   : 3442.03        Mean   : 3431.76         Mean   :2017                 
##  3rd Qu.: 3848.31        3rd Qu.: 3852.76         3rd Qu.:2019                 
##  Max.   :31071.56        Max.   :31019.03         Max.   :2020                 
##  low_service_volumes is_underserved
##  Mode:logical        Mode:logical  
##  TRUE:2068           TRUE:2068     
##                                    
##                                    
##                                    
## 
str(underserved_regions)
## 'data.frame':    2068 obs. of  22 variables:
##  $ Rfrg_Prvdr_Geo_Lvl      : chr  "State" "State" "State" "State" ...
##  $ Rfrg_Prvdr_Geo_Cd       : chr  "02" "10" "35" "50" ...
##  $ Rfrg_Prvdr_Geo_Desc     : chr  "Alaska" "Delaware" "New Mexico" "Vermont" ...
##  $ RBCS_Lvl                : chr  "Durable Medical Equipment" "Durable Medical Equipment" "Durable Medical Equipment" "Durable Medical Equipment" ...
##  $ RBCS_Id                 : chr  "DD000N" "DD000N" "DD000N" "DD000N" ...
##  $ RBCS_Desc               : chr  "DME-Wheelchairs" "DME-Wheelchairs" "DME-Wheelchairs" "DME-Wheelchairs" ...
##  $ HCPCS_Cd                : chr  "E1002" "E1002" "E1002" "E1002" ...
##  $ HCPCS_Desc              : chr  "Wheelchair accessory, power seating system, tilt only" "Wheelchair accessory, power seating system, tilt only" "Wheelchair accessory, power seating system, tilt only" "Wheelchair accessory, power seating system, tilt only" ...
##  $ Suplr_Rentl_Ind         : chr  "N" "N" "N" "N" ...
##  $ Tot_Rfrg_Prvdrs         : int  12 15 12 15 3 4 5 12 14 14 ...
##  $ Tot_Suplrs              : int  3 4 5 6 3 3 3 10 12 5 ...
##  $ Tot_Suplr_Benes         : int  12 15 12 15 86 86 86 12 14 15 ...
##  $ Tot_Suplr_Clms          : int  12 15 12 15 11 13 14 12 14 15 ...
##  $ Tot_Suplr_Srvcs         : int  12 15 12 15 11 13 14 12 14 15 ...
##  $ Avg_Suplr_Sbmtd_Chrg    : num  5252 6610 6727 5590 8442 ...
##  $ Avg_Suplr_Mdcr_Alowd_Amt: num  3820 3818 3789 3804 330 ...
##  $ Avg_Suplr_Mdcr_Pymt_Amt : num  2995 2993 2971 2982 259 ...
##  $ Avg_Suplr_Mdcr_Stdzd_Amt: num  2995 2993 2971 2982 259 ...
##  $ Year                    : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ high_charges            : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ low_service_volumes     : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ is_underserved          : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
#  View underserved regions
head(underserved_regions)
##      Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd Rfrg_Prvdr_Geo_Desc
## 7208              State                02              Alaska
## 7214              State                10            Delaware
## 7237              State                35          New Mexico
## 7251              State                50             Vermont
## 7261              State                19                Iowa
## 7264              State                27           Minnesota
##                       RBCS_Lvl RBCS_Id       RBCS_Desc HCPCS_Cd
## 7208 Durable Medical Equipment  DD000N DME-Wheelchairs    E1002
## 7214 Durable Medical Equipment  DD000N DME-Wheelchairs    E1002
## 7237 Durable Medical Equipment  DD000N DME-Wheelchairs    E1002
## 7251 Durable Medical Equipment  DD000N DME-Wheelchairs    E1002
## 7261 Durable Medical Equipment  DD000N DME-Wheelchairs    E1002
## 7264 Durable Medical Equipment  DD000N DME-Wheelchairs    E1002
##                                                 HCPCS_Desc Suplr_Rentl_Ind
## 7208 Wheelchair accessory, power seating system, tilt only               N
## 7214 Wheelchair accessory, power seating system, tilt only               N
## 7237 Wheelchair accessory, power seating system, tilt only               N
## 7251 Wheelchair accessory, power seating system, tilt only               N
## 7261 Wheelchair accessory, power seating system, tilt only               Y
## 7264 Wheelchair accessory, power seating system, tilt only               Y
##      Tot_Rfrg_Prvdrs Tot_Suplrs Tot_Suplr_Benes Tot_Suplr_Clms Tot_Suplr_Srvcs
## 7208              12          3              12             12              12
## 7214              15          4              15             15              15
## 7237              12          5              12             12              12
## 7251              15          6              15             15              15
## 7261               3          3              86             11              11
## 7264               4          3              86             13              13
##      Avg_Suplr_Sbmtd_Chrg Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt
## 7208             5252.083                3820.1150               2994.9717
## 7214             6609.733                3817.6147               2993.0113
## 7237             6726.528                3789.0050               2970.5817
## 7251             5590.318                3804.1313               2982.4393
## 7261             8441.909                 329.9191                258.6618
## 7264             3596.281                 328.6085                257.6338
##      Avg_Suplr_Mdcr_Stdzd_Amt Year high_charges low_service_volumes
## 7208                2994.9717 2014         TRUE                TRUE
## 7214                2993.0113 2014         TRUE                TRUE
## 7237                2970.5817 2014         TRUE                TRUE
## 7251                2982.4393 2014         TRUE                TRUE
## 7261                 258.6618 2014         TRUE                TRUE
## 7264                 270.7031 2014         TRUE                TRUE
##      is_underserved
## 7208           TRUE
## 7214           TRUE
## 7237           TRUE
## 7251           TRUE
## 7261           TRUE
## 7264           TRUE
# Filter the underserved regions
underserved_regions <- combined_data %>% filter(is_underserved == TRUE)

# View the first few rows with geographic data and year
head(underserved_regions %>% select(Year, Rfrg_Prvdr_Geo_Desc, Rfrg_Prvdr_Geo_Cd, Tot_Suplr_Srvcs, Avg_Suplr_Sbmtd_Chrg))
##      Year Rfrg_Prvdr_Geo_Desc Rfrg_Prvdr_Geo_Cd Tot_Suplr_Srvcs
## 7208 2014              Alaska                02              12
## 7214 2014            Delaware                10              15
## 7237 2014          New Mexico                35              12
## 7251 2014             Vermont                50              15
## 7261 2014                Iowa                19              11
## 7264 2014           Minnesota                27              13
##      Avg_Suplr_Sbmtd_Chrg
## 7208             5252.083
## 7214             6609.733
## 7237             6726.528
## 7251             5590.318
## 7261             8441.909
## 7264             3596.281
# Group underserved regions by year and geographic description
underserved_by_year_geo <- underserved_regions %>%
  group_by(Year, Rfrg_Prvdr_Geo_Desc) %>%
  summarise(Total_Underserved = n(),
            Avg_Supplier_Services = mean(Tot_Suplr_Srvcs, na.rm = TRUE),
            Avg_Supplier_Charges = mean(Avg_Suplr_Sbmtd_Chrg, na.rm = TRUE)) %>%
  arrange(Year, desc(Total_Underserved))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# View the underserved regions by year and geography
head(underserved_by_year_geo)
## # A tibble: 6 × 5
## # Groups:   Year [1]
##    Year Rfrg_Prvdr_Geo_Desc Total_Underserved Avg_Supplier_Services
##   <int> <chr>                           <int>                 <dbl>
## 1  2014 National                           17                  13.3
## 2  2014 Florida                            10                  13  
## 3  2014 Nevada                             10                  13  
## 4  2014 Georgia                             8                  14  
## 5  2014 New Mexico                          8                  13.4
## 6  2014 Oregon                              8                  14.1
## # ℹ 1 more variable: Avg_Supplier_Charges <dbl>
# Select the top 5 underserved regions for each year using the filtered dataset
top_underserved_by_year <- underserved_by_year_geo %>%
  group_by(Year) %>%
  slice_max(order_by = Total_Underserved, n = 5) %>%
  ungroup()

# Plot the top 5 underserved regions by year and geographic area
ggplot(top_underserved_by_year, 
       aes(x=reorder(Rfrg_Prvdr_Geo_Desc, -Total_Underserved), y=Total_Underserved, fill=factor(Year))) +
  geom_bar(stat="identity") +
  coord_flip() +  # Flip the coordinates to make the bar chart horizontal
  facet_wrap(~ Year, scales = "free_y") +  
  labs(title="Top 5 Underserved Regions by Year", x="Geographic Region", y="Total Underserved") +
  
  theme_minimal() 

# Plot trend of underserved regions over time
ggplot(underserved_by_year_geo, aes(x=Year, y=Total_Underserved, color=Rfrg_Prvdr_Geo_Desc)) +
  geom_line() +
  labs(title="Trend of Underserved Regions Over Time", x="Year", y="Total Underserved") +
  theme_minimal()

write.csv(combined_data, "combined_data.csv", row.names = FALSE)
# View a summary of the combined_data
summary(combined_data)
##  Rfrg_Prvdr_Geo_Lvl Rfrg_Prvdr_Geo_Cd  Rfrg_Prvdr_Geo_Desc   RBCS_Lvl        
##  Length:295052      Length:295052      Length:295052       Length:295052     
##  Class :character   Class :character   Class :character    Class :character  
##  Mode  :character   Mode  :character   Mode  :character    Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##    RBCS_Id           RBCS_Desc           HCPCS_Cd          HCPCS_Desc       
##  Length:295052      Length:295052      Length:295052      Length:295052     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Suplr_Rentl_Ind    Tot_Rfrg_Prvdrs      Tot_Suplrs       Tot_Suplr_Benes  
##  Length:295052      Min.   :     1.0   Min.   :    1.00   Min.   :     11  
##  Class :character   1st Qu.:    16.0   1st Qu.:    7.00   1st Qu.:     33  
##  Mode  :character   Median :    47.0   Median :   17.00   Median :     89  
##                     Mean   :   478.6   Mean   :   94.38   Mean   :   1669  
##                     3rd Qu.:   180.0   3rd Qu.:   49.00   3rd Qu.:    295  
##                     Max.   :271965.0   Max.   :49095.00   Max.   :3495668  
##  Tot_Suplr_Clms     Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg
##  Min.   :      11   Min.   :       11   Min.   :    0.01    
##  1st Qu.:      33   1st Qu.:       54   1st Qu.:   19.36    
##  Median :     121   Median :      305   Median :   92.21    
##  Mean   :    4371   Mean   :    83082   Mean   :  541.91    
##  3rd Qu.:     600   3rd Qu.:     3015   3rd Qu.:  393.33    
##  Max.   :10498977   Max.   :608845461   Max.   :70913.34    
##  Avg_Suplr_Mdcr_Alowd_Amt Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt
##  Min.   :    0.01         Min.   :    0.000       Min.   :    0.000       
##  1st Qu.:    9.49         1st Qu.:    7.157       1st Qu.:    7.377       
##  Median :   48.73         Median :   36.761       Median :   37.475       
##  Mean   :  340.29         Mean   :  263.389       Mean   :  264.212       
##  3rd Qu.:  206.05         3rd Qu.:  159.055       3rd Qu.:  162.317       
##  Max.   :39757.35         Max.   :31071.560       Max.   :31019.030       
##       Year      high_charges    low_service_volumes is_underserved 
##  Min.   :2014   Mode :logical   Mode :logical       Mode :logical  
##  1st Qu.:2015   FALSE:280299    FALSE:277781        FALSE:292984   
##  Median :2017   TRUE :14753     TRUE :17271         TRUE :2068     
##  Mean   :2017                                                      
##  3rd Qu.:2019                                                      
##  Max.   :2020
# Step 1: Identify numeric columns
num_cols <- combined_data[sapply(combined_data, is.numeric)]

# Step 2: Calculate standard deviation for each numeric column
sd_values <- sapply(num_cols, sd)

# Print the standard deviation values
print(sd_values)
##          Tot_Rfrg_Prvdrs               Tot_Suplrs          Tot_Suplr_Benes 
##             4.182212e+03             6.860545e+02             2.582794e+04 
##           Tot_Suplr_Clms          Tot_Suplr_Srvcs     Avg_Suplr_Sbmtd_Chrg 
##             7.647500e+04             2.317080e+06             1.779177e+03 
## Avg_Suplr_Mdcr_Alowd_Amt  Avg_Suplr_Mdcr_Pymt_Amt Avg_Suplr_Mdcr_Stdzd_Amt 
##             1.185538e+03             9.193592e+02             9.183867e+02 
##                     Year 
##             1.997253e+00